* Close any log left open by a previous run, then start a fresh text log.
capture log close
log using "E:\Forschung_offline\08_becker_conjecture\Output\fstats", text replace

*-----------------------------------------------------------------------
* Do-file: construct major-level variables from the German microcensus
*          and from the HIS graduate surveys
* fo 02.07.2015
*-----------------------------------------------------------------------
version 13
set more off
clear

* Working directory: intermediate files (mzpooled.dta, 1997.dta) and the
* final fstats.dta are written here via relative file names.
cd "E:\Forschung_offline\08_becker_conjecture\data\"

* Folder holding the microcensus waves (adjust to the local machine).
local dircensus = "E:\Forschung_offline\data\Mikrozensus\"

* Folder holding the HIS surveys (adjust to the local machine).
local dirhis = "E:\Forschung_offline\data\HIS_Abs\"


**********************************
* information from German census *
**********************************

* Stack the 2007, 2008 and 2009 microcensus waves into mzpooled.dta.
* Each wave gets a running person id with a wave-specific offset
* (0, 500000, 1000000) and a cohort variable holding the survey year.
* NOTE(review): the offsets keep id unique only while each of the first
* two waves has at most 500000 rows -- confirm against the data.

* variables carried over from every wave
local mzvars EF436 EF134 EF313 EF46 EF1 EF314 EF44 EF310 EF312 EF77 EF129 EF201 EF866 EF952

* --- 2007: starts the pooled file ---
use "`dircensus'mz2007.dta", clear
gen id = _n
keep id `mzvars'
gen cohort = 2007
save "mzpooled.dta", replace

* --- 2008: variable names are lower case in this wave, so bring them
*     to the upper-case spelling of the other waves before keeping ---
use "`dircensus'mz2008.dta", clear
gen id = _n + 500000
foreach target of local mzvars {
	local source = lower("`target'")
	rename `source' `target'
}
keep id `mzvars'
gen cohort = 2008
append using "mzpooled.dta"
save "mzpooled.dta", replace

* --- 2009: codes -5, -3 and -2 of EF201 and EF310 are set to 0 in this
*     wave only ---
use "`dircensus'mz2009.dta", clear
gen id = _n + 1000000
keep id `mzvars'
gen cohort = 2009
recode EF201 EF310 (-5 -3 -2 = 0)
append using "mzpooled.dta"
save "mzpooled.dta", replace


*** person characteristics (inputs of the Mincer regression) ***

* Monthly income: each income class of EF436 is replaced by a euro amount
* (following Glocker/Storck 2014); codes 50, 90 and 99 become missing.
recode EF436 ///
	(50 90 99 = .) ///
	( 1 =    52) ( 2 =   231) ( 3 =   404) ( 4 =   620) ///
	( 5 =   818) ( 6 =  1023) ( 7 =  1227) ( 8 =  1437) ///
	( 9 =  1627) (10 =  1884) (11 =  2171) (12 =  2475) ///
	(13 =  2777) (14 =  3065) (15 =  3438) (16 =  3865) ///
	(17 =  4284) (18 =  4836) (19 =  5280) (20 =  5864) ///
	(21 =  6764) (22 =  9005) (23 = 12886) (24 = 21778) ///
	, gen(inc_mon)

* Weekly hours: 99, 0, -3 and -2 become missing; 80 and above is top-coded
* at 80. The first rule wins, so 99 ends up missing, not 80.
recode EF134 (99 0 -3 -2 = .) (80/150 = 80), gen(arbzt_woch)

* Hourly wage = monthly income / (4 * weekly hours), and its natural log.
gen wage = inc_mon / (4*arbzt_woch)
gen lnwage = ln(wage)

* Female dummy (EF46: 1 -> 0, 2 -> 1).
recode EF46 (1 = 0) (2 = 1), gen(frau)

* East Germany dummy (EF1: codes 1-10 -> 0, codes 11-16 -> 1).
recode EF1 (1/10 = 0) (11/16 = 1), gen(ost)

* Years since the degree was received. Degree years outside 1..2009
* (including missing) are set to missing first.
replace EF314 = . if !(EF314 > 0 & EF314 <= 2009)
gen exper = cohort - EF314
gen exper2 = exper^2

* Age and age squared.
gen age = EF44
gen age2 = age^2

* college degree
* hsabs = 1 for EF312 codes 6-10, 0 for codes 1-5 and 11, missing for
* -5, -3, 0 and 99. Only degree holders are kept from here on.
recode EF312 (-5 -3 0 99=.)(1/5=0)(6/10=1)(11=0), gen(hsabs)
keep if hsabs==1

* college Major
	* scheme (almost) identical with HIS but coding is not
	* therefore first step: code census into HIS coding
	* Geosciences/Geography --> Geoscience ; Regional science --> Geography
	* Teachers --> 500 (temporary code, mapped to 80 below)
	* NOTE(review): EF313 codes without a rule (e.g. 27, 30, 39) keep their
	* original value and therefore land in the HIS codes of the same number;
	* confirm that this is intended.

recode EF313 (-5 -3 0 98 99=.)(2=2)(3=3)(5=4)(6=5)(7 8=6)(9=7)(10=8)(11=9) ///
	(12=10)(13=11)(14=12)(15=13)(16=14)(17=15)(18=16) ///
	(25=22)(26=1)(28=25)(29=44)(31=28)(32=29)(33 34 35 36 37=30) ///
	(38=31)(40 41=37)(42=38)(43=39)(44=40)(45=41)(46=42)(47=43)(48=49) ///
	(49=50)(50=51)(51=58)(52=57)(53=59)(54 55=60)(56=36)(57=62)(58 59=63) ///
	(60 61=64)(62=40)(63=65)(66=66)(67=68)(72 73=74)(74=77)(75=78)(76=76)(92=27) ///
	(4 64 65 68 69 70 71 77 78 80 81 82 84 85 86 87 88 89 90 91 93 94 95=.) ///
	(19 20 21 22 23 24=500), gen(B1ber1ab1)

* Merge small but similar majors
	* The merge rules must match the ones applied to the HIS data further
	* down in this file, because both sources fill the same rows of the
	* results matrix. The HIS rules also merge code 9 into 14 and code 51
	* into 49; the census rules previously left 9 and 51 unmerged, so those
	* graduates were dropped by the keep filter below. Both codes are now
	* merged here as well.
	* NOTE(review): confirm with the code book that 9 and 51 belong to these
	* groups in the census coding too.
	* cath. theology --> prot. theology
	recode B1ber1ab1(3=2)
	* philosophy --> history
	recode B1ber1ab1(4=5)
	* non-german philologies and cultural sciences --> cultural sciences
	recode B1ber1ab1(1 7 8 9 10 11 12 13=14)
	* special needs pedagogy --> pedagogy
	recode B1ber1ab1(17=16)
	* social and economic studies, political science --> social sciences
	recode B1ber1ab1(23 25=26)
	* mining and metallurgy --> geosciences
	recode B1ber1ab1(62=43)
	* urban and regional planning --> geography
	recode B1ber1ab1(67 57=44)
	* dentistry (and code 51, as in the HIS rules) --> medical sciences
	recode B1ber1ab1(50 51=49)
	* forestry --> agronomy
	recode B1ber1ab1(59=58)
	* design --> architecture and interior design
	recode B1ber1ab1(76=66)
	* surveying and mapping, traffic engineering --> civil engineering
	recode B1ber1ab1 (65 69=68)
	* fine arts, performing arts, music, musicology --> art, art history
	recode B1ber1ab1(75 77 78=74)
	* state teacher
	recode B1ber1ab1(500=80)

tab B1ber1ab1

* Only the fields with sufficient observations in NEPS
* (same list of 23 codes as in the HIS section)
keep if inlist(B1ber1ab1, 5, 14, 15, 16, 26, 27, 28, 30, 31, 37, 38, 39, ///
	40, 42, 44, 49, 58, 63, 64, 66, 68, 74, 80)

* full university vs. univ. of applied sciences
* univ = 1 for EF312 codes 9-10, 0 for codes 1-8 and 11
recode EF312 (-5 -3 0 99=.)(9/10=1)(1/8=0)(11=0), gen(univ)

* full abitur or not
* abi = 1 for EF310 code 5, 0 for codes 1-4 and 6
recode EF310 (-5 -3 0 9=.)(5=1)(1 2 3 4 6=0), gen(abi)

* phd
* promo = 1 for EF312 code 10, 0 for codes 1-9 and 11
recode EF312 (-5 -3 0 99=.)(10=1)(1/9 11=0), gen(promo)
replace promo = 0 if inlist(B1ber1ab1, 49, 50) 	/* MDs and dentists do not hold real PhDs */

* survey-year dummies (cohort is set for every row, so no missings arise)
gen coh2007 = (cohort == 2007)
gen coh2008 = (cohort == 2008)
gen coh2009 = (cohort == 2009)

* full-time indicator: 35 or more hours per week.
* Missing is larger than any number in Stata, so a plain
* "arbzt_woch >= 35" would flag people with unknown hours as full-time.
* fulltime is therefore left missing when hours are missing.
gen fulltime = (arbzt_woch >= 35) if !missing(arbzt_woch)


*** define the estimation sample ***
* Rows with a missing wage or missing hours pass the wage and hours
* filters (missing compares as larger than any number); they are removed
* by the listwise deletion at the end of this section.

keep if hsabs == 1								/* college graduates only				*/
keep if wage >= 2								/* drop implausibly low wages			*/
keep if arbzt_woch >= 20						/* drop less than 20 hrs per week		*/
keep if EF77 == 1								/* drop those without paid employment	*/
keep if inrange(EF314 - (cohort - age), 18, 35)	/* age at receipt of degree in 18..35	*/
keep if EF201 <= 0								/* drop holders of more than one job	*/
keep if inrange(age, 25, 55)					/* age between 25 and 55				*/

* listwise deletion on every variable used in the wage regression
egen miss_ind = rowmiss(lnwage cohort frau ost exper exper2 age age2 B1ber1ab1 univ abi promo)
keep if miss_ind == 0

* one dummy per remaining major (fachdum1, fachdum2, ... in ascending
* order of B1ber1ab1); the number of majors is stored in $fachnr
quietly tabulate B1ber1ab1, generate(fachdum)
global fachnr = r(r)


***************************************************
***		compute major level characteristics		***
***************************************************

/*** results matrix fstats: one row per major, 12 columns

	  col  content
	   1   major id
	   2   major size (observations)
	   3   proportion female (mean)
	   4   proportion female (se)
	 human capital theory
	   5   worklife index (mean)
	   6   worklife index (se)
	   7   hrs worked if full time job (mean)
	   8   hrs worked if full time job (se)
	 anticipation of discrimination
	   9   perceived discrimination (mean of women's responses)
	  10   perceived discrimination (se)
	 gender roles: breadwinner
	  11   wage level
	  12   wage level (se)

	 Columns 1-4 and 7-8 are filled in this section; the remaining
	 columns are filled further down in the file.
***/

matrix fstats = J($fachnr, 12, .)
matrix colnames fstats = 1_B1ber1ab1 2_obs 3_%frau 4_%frau_se 5_worklife 6_worklife_se ///
	7_ftarbzt 8_ftarbzt_se 9_%sdiscr 10_%sdiscr_se 11_intercpt 12_intercpt_se

* declare the survey design: id as sampling unit, EF952 as sampling weight
svyset id [pweight = EF952]

* row `row' of fstats belongs to the major flagged by fachdum`row'
quietly forvalues row = 1/$fachnr {
	local thismajor "fachdum`row' == 1"

	* major code and number of observations
	summarize B1ber1ab1 if `thismajor'
	matrix fstats[`row', 1] = r(mean)
	matrix fstats[`row', 2] = r(N)

	* weighted share of women
	svy: mean frau if `thismajor'
	matrix fstats[`row', 3] = _b[frau]
	matrix fstats[`row', 4] = _se[frau]

	* weighted mean weekly hours among those working 35 hours or more
	svy: mean arbzt_woch if `thismajor' & arbzt_woch >= 35
	matrix fstats[`row', 7] = _b[arbzt_woch]
	matrix fstats[`row', 8] = _se[arbzt_woch]
}

* wage regression
* Log hourly wage on major dummies and controls. Major dummy number
* `refcat' (econ sciences = 8) is the omitted reference category.
local refcat 8

* build the list of major dummies from $fachnr instead of hard-coding
* "fachdum1-fachdum7 fachdum9-fachdum23"
global fdum ""
forvalues i = 1/$fachnr {
	if `i' != `refcat' {
		global fdum "$fdum fachdum`i'"
	}
}
global controls = "ost exper exper2 age age2 abi promo univ coh2008 coh2009"

reg lnwage $fdum $controls [pw=EF952]
est sto mincer
esttab mincer using "E:\Forschung_offline\08_becker_conjecture\Output\mincer.csv", se nostar r2 replace

* plug estimates into results matrix
* Column 11 holds the relative wage difference to the reference major,
* exp(b) - 1. Column 12 holds its standard error by the delta method,
* exp(b) * se(b); exponentiating the standard error itself, exp(se) - 1,
* is not a standard error of exp(b) - 1.
* The reference major gets 0 in column 11; its column 12 stays missing.
forvalues i = 1/$fachnr {
	if `i' == `refcat' {
		matrix fstats[`i', 11] = 0
	}
	else {
		matrix fstats[`i', 11] = exp(_b[fachdum`i']) - 1
		matrix fstats[`i', 12] = exp(_b[fachdum`i']) * _se[fachdum`i']
	}
}
matrix list fstats, format(%8.3g)


************************************
***	information from HIS surveys ***
************************************

*** pool the two HIS surveys ***

* --- 1997 survey (ZA4272) ---
use "`dirhis'ZA4272.dta", clear
label define noyes 0"nein" 1"ja"
label define noneuyes 0"nein oder neutral" 1"ja"
generate id_neu = id_suf
generate cohort2001 = 0
* birth year -1 is a missing code; alter is the age in the survey year
replace K1gebjahr = . if K1gebjahr == -1
generate alter = 1997 - K1gebjahr
keep id_neu cohort2001 H1krgeschl K1geschl X1gewinsg B1ber1ab1 alter C2zuffam C2zufarb C2zufleb
sort id_neu
save "1997.dta", replace

* --- 2001 survey (ZA5186) ---
use "`dirhis'ZA5186.dta", clear
label define noyes 0"nein" 1"ja"
label define noneuyes 0"nein oder neutral" 1"ja"
* ids are shifted by 6216 so they do not collide with the 1997 ids
* NOTE(review): presumably 6216 is the largest id_suf of 1997 -- confirm
generate id_neu = id_suf + 6216
generate cohort2001 = 1
replace k1gebjahr = . if k1gebjahr == -1
generate alter = 2001 - k1gebjahr

* the 2001 file spells variable names with a lower-case first letter;
* bring them to the 1997 spelling
rename (x1gewinsg h1krgeschl k1geschl b1ber1ab1 b1ber2ab1 b1hfabsp1 c2zuffam c2zufarb c2zufleb) ///
       (X1gewinsg H1krgeschl K1geschl B1ber1ab1 B1ber2ab1 B1hfabsp1 C2zuffam C2zufarb C2zufleb)

* B1ber2ab1 and B1hfabsp1 are kept for 2001 only; they are not part of
* 1997.dta and are therefore missing for the 1997 rows after the append
keep id_neu cohort2001 H1krgeschl K1geschl X1gewinsg B1ber1ab1 B1ber2ab1 B1hfabsp1 alter C2zuffam C2zufarb C2zufleb
sort id_neu
append using "1997.dta"

*** derive the analysis variables ***

* Female dummy (K1geschl: 1 -> 0, 2 -> 1, -1 -> missing);
* rows without gender information are removed.
recode K1geschl (-1 = .) (1 = 0) (2 = 1), gen(frau)
label variable frau "Geschlecht"
label define frau 0"Mann" 1"Frau"
label values frau frau
keep if frau != .

* Perceived gender discrimination: answers 1-2 -> 1, answers 3-5 -> 0,
* codes -2 and -1 -> missing.
recode H1krgeschl (-2 -1 = .) (5 4 3 = 0) (2 1 = 1), gen(sdiscr)

* Satisfaction with work-life compatibility.
* Each of the three items is reversed from 1..5 to 4..0 (codes -2 and -1
* become missing) and stored as zuffam, zufarb and zufleb.
foreach item in fam arb leb {
	recode C2zuf`item' (-2 -1 = .) (5 = 0) (4 = 1) (3 = 2) (2 = 3) (1 = 4), gen(zuf`item')
}
* Index = sum of the three reversed items divided by 12;
* missing as soon as one item is missing.
gen worklife = (zuffam + zufarb + zufleb)/12 if zuffam != . & zufarb != . & zufleb != .


*** aggregate to major characteristics ***

* lehramt: B1hfabsp1 codes 5-9 -> 1, codes 1-4 and 12-19 -> 0
recode B1hfabsp1 (1/4 = 0) (5/9 = 1) (12/19 = 0), gen(lehramt)
label variable lehramt "Lehramt"
label values lehramt noyes

* everybody flagged by lehramt is moved to major 80 (State School Teacher)
replace B1ber1ab1 = 80 if lehramt == 1

* Merge small but similar majors.
* All rules sit in one recode; no target code is also a source code, so
* the result is the same as applying the rules one after another.
recode B1ber1ab1 ///
	(3 = 2)                     /// cath. theology --> prot. theology
	(4 = 5)                     /// philosophy --> history
	(1 7 8 9 10 11 12 13 = 14)  /// non-german philologies and cultural sciences --> cultural sciences
	(17 = 16)                   /// special needs pedagogy --> pedagogy
	(23 25 = 26)                /// social and economic studies, political science --> social sciences
	(62 = 43)                   /// mining and metallurgy --> geosciences
	(67 57 = 44)                /// urban and regional planning --> geography
	(50 51 = 49)                /// dentistry --> medical sciences
	(59 = 58)                   /// forestry --> agronomy
	(76 = 66)                   /// design --> architecture and interior design
	(65 69 = 68)                /// surveying and mapping, traffic engineering --> civil engineering
	(75 77 78 = 74)             // fine arts, performing arts, music, musicology --> art, art history

* fach: copy of the major code with -2 and -1 set to missing,
* carrying the value label stb with the numeric codes prefixed
recode B1ber1ab1 (-2 -1 = .), gen(fach)
label values fach stb
numlabel stb, add

* Only the fields with sufficient obs in NEPS
keep if inlist(B1ber1ab1, 5, 14, 15, 16, 26, 27, 28, 30, 31, 37, 38, 39, ///
	40, 42, 44, 49, 58, 63, 64, 66, 68, 74, 80)

* one dummy per remaining major; $fachnr is overwritten with the number
* of majors found in the HIS data
quietly tabulate fach, generate(fachdum)
global fachnr = r(r)
label values fachdum1-fachdum$fachnr stb

* compute (weighted) averages by major and plug into results matrix
* Columns 5-6 (worklife) and 9-10 (perceived discrimination, women only)
* of fstats are filled here. The rows of fstats were created from the
* census data, and row i is matched to HIS major dummy i by position
* only. The checks below stop the run if the two sources do not contain
* the same majors in the same order, instead of silently writing HIS
* results into the row of a different major.
svyset id_neu[pw=X1gewinsg]

if $fachnr != rowsof(fstats) {
	display as error "HIS data contain $fachnr majors, but fstats has " rowsof(fstats) " rows"
	exit 459
}

qui forvalues i = 1 / $fachnr {
	* major code behind dummy i must equal the code stored in column 1
	summarize fach if (fachdum`i'==1), meanonly
	if r(min) != el(fstats, `i', 1) | r(max) != el(fstats, `i', 1) {
		noisily display as error "row `i' of fstats holds major " el(fstats, `i', 1) ", but HIS dummy `i' is major " r(min)
		exit 459
	}
	svy: mean worklife if (fachdum`i'==1)
		matrix fstats [`i', 5] = _b[worklife]
		matrix fstats [`i', 6] = _se[worklife]
	svy: mean sdiscr if (fachdum`i'==1 & frau)
		matrix fstats [`i', 9] = _b[sdiscr]
		matrix fstats [`i', 10] = _se[sdiscr]
		}
mat list fstats


*** turn the results matrix into a dataset for transport to the NEPS server ***

* svmat adds the matrix columns as variables fstats1 ... fstats12 to the
* data in memory; everything else is discarded.
svmat fstats
keep fstats*

* labels (only the mean columns and the observation count get one)
label values fstats1 stb
label variable fstats2 "Observations"
label variable fstats3 "%female"
label variable fstats5 "work-life satisfaction"
label variable fstats7 "hrs. worked(ft)"
label variable fstats9 "discrimination vs. women"
label variable fstats11 "intercept wage"

* Only the first $fachnr observations carry matrix rows; the remaining
* observations are left over from the survey data.
drop if _n > $fachnr
save "fstats.dta", replace

*** END OF DOFILE ***
log close
exit, clear
